import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
init_notebook_mode()
%matplotlib inline
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
df = pd.read_csv("cleaned.csv")
df0 = df
tran = pd.read_csv("train_2016_v2.csv")
#clean up the transaction dates by leaving only the month for easier graphics
yearmonth = []
for i in range(0,len(tran)):
yearmonth.append(tran["transactiondate"][i][0:7])
tran["yearmonth"] = yearmonth
tran =tran.sort_values(by=['yearmonth'])
transaction = pd.merge(tran,df, on='parcelid', how='inner')
transaction =transaction.sort_values(by=['yearmonth'])
#transaction.groupby('fips').count()["parcelid"]
#fips = county code
#6037 = los angeles
#6059 = Orange county
#6111 = Ventura County
jan = transaction[transaction["yearmonth"] == '2016-01'].groupby('fips').count()["parcelid"].values.tolist()
feb = transaction[transaction["yearmonth"] == '2016-02'].groupby('fips').count()["parcelid"].values.tolist()
march = transaction[transaction["yearmonth"] == '2016-03'].groupby('fips').count()["parcelid"].values.tolist()
april = transaction[transaction["yearmonth"] == '2016-04'].groupby('fips').count()["parcelid"].values.tolist()
may = transaction[transaction["yearmonth"] == '2016-05'].groupby('fips').count()["parcelid"].values.tolist()
june = transaction[transaction["yearmonth"] == '2016-06'].groupby('fips').count()["parcelid"].values.tolist()
july = transaction[transaction["yearmonth"] == '2016-07'].groupby('fips').count()["parcelid"].values.tolist()
aug = transaction[transaction["yearmonth"] == '2016-08'].groupby('fips').count()["parcelid"].values.tolist()
sep = transaction[transaction["yearmonth"] == '2016-09'].groupby('fips').count()["parcelid"].values.tolist()
octo = transaction[transaction["yearmonth"] == '2016-10'].groupby('fips').count()["parcelid"].values.tolist()
nov = transaction[transaction["yearmonth"] == '2016-11'].groupby('fips').count()["parcelid"].values.tolist()
dec = transaction[transaction["yearmonth"] == '2016-12'].groupby('fips').count()["parcelid"].values.tolist()
pcts = [jan,feb,march,april, may,june, july,octo, sep, nov, dec]
l1= []
l2 = []
l3 = []
for i in pcts:
l1.append(i[0])
l2.append(i[1])
l3.append(i[2])
import plotly
import plotly.graph_objs as go
x = ["January", "February", "March","April","May", "June","July","August", "September", "October","November","December"]
trace1 = go.Bar(
x=x,
y=l1,
name='Los Angeles'
)
trace2 = go.Bar(
x=x,
y=l2,
name='Orange County'
)
trace3 = go.Bar(
x=x,
y=l3,
name='Ventura County'
)
data = [trace1, trace2,trace3]
layout = go.Layout(
barmode='stack',
title = "Number of Transactions by Month"
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig, filename='stacked-bar')
transaction["latitude"] = transaction["latitude"]/1000000
transaction["longitude"] = transaction["longitude"]/1000000
np.random.seed(11452)
train = np.random.rand(len(transaction)) <0.5
trans_train = transaction[train]
#len(trans_train)
import plotly
from plotly.plotly import iplot
import plotly.graph_objs as go
mapbox_access_token = 'pk.eyJ1IjoiYmlwYXJ0aXRlaHlwZXJjdWJlIiwiYSI6ImNqczFjZzUydjF0MGc0OW1sYWIwYW5lY2UifQ.l6OVeSa3poXp6S4s8km8kA'
map_style = "mapbox://styles/bipartitehypercube/cjs26r8sw1cy31fqjhopqigtj"
scl = [0,"rgb(0,0,0)"],[0.25,"rgb(0, 0, 128)"],[0.5,"rgb(0, 25, 255)"],\
[0.75,"rgb(0, 0, 205)"],[1,"rgb(250, 250, 250)"]
data = [go.Scattermapbox(
lat=trans_train['latitude'],
lon=trans_train['longitude'],
mode='markers',
marker = dict(color = (trans_train['taxvaluedollarcnt']),colorscale = scl,reversescale = True,opacity = 0.7,size = 5,
colorbar = dict(thickness = 10,titleside = "right",
outlinecolor = "rgba(68, 68, 68, 0)",
ticks = "outside",ticklen = 3,
showticksuffix = "last",
ticksuffix = " Dollars",dtick = 1000000)))]
layout = go.Layout(
autosize=True,
hovermode='closest',
mapbox=dict(
accesstoken=mapbox_access_token,
bearing=0,
center=dict(lat=34,lon=-118.5),
style= map_style,
pitch=0,
zoom=7,),
title = 'Transaction location with Property Price',
)
fig = dict(data=data, layout=layout)
plotly.offline.iplot(fig, filename='Geolocation of properties')
import plotly
from plotly.plotly import iplot
import plotly.graph_objs as go
mapbox_access_token = 'pk.eyJ1IjoiYmlwYXJ0aXRlaHlwZXJjdWJlIiwiYSI6ImNqczFjZzUydjF0MGc0OW1sYWIwYW5lY2UifQ.l6OVeSa3poXp6S4s8km8kA'
map_style = "mapbox://styles/bipartitehypercube/cjs26r8sw1cy31fqjhopqigtj"
scl = [0,"rgb(0,0,0)"],[0.25,"rgb(0, 0, 128)"],[0.5,"rgb(0, 25, 255)"],\
[0.75,"rgb(0, 0, 205)"],[1,"rgb(250, 250, 250)"]
data = [go.Scattermapbox(
lat=trans_train['latitude'],
lon=trans_train['longitude'],
mode='markers',
marker = dict(color = (trans_train["calculatedfinishedsquarefeet"]),colorscale = scl,reversescale = True,opacity = 0.7,size = 5,
colorbar = dict(thickness = 10,titleside = "right",
outlinecolor = "rgba(68, 68, 68, 0)",
ticks = "outside",ticklen = 3,
showticksuffix = "last",
ticksuffix = " Square Feet",dtick = 700)))]
layout = go.Layout(
autosize=True,
hovermode='closest',
mapbox=dict(
accesstoken=mapbox_access_token,
bearing=0,
center=dict(lat=34,lon=-118.5),
style= map_style,
pitch=0,
zoom=7,),
title = 'Transaction location with Property Size',
)
fig = dict(data=data, layout=layout)
plotly.offline.iplot(fig, filename='Geolocation of properties')
import plotly
from plotly.plotly import iplot
import plotly.graph_objs as go
mapbox_access_token = 'pk.eyJ1IjoiYmlwYXJ0aXRlaHlwZXJjdWJlIiwiYSI6ImNqczFjZzUydjF0MGc0OW1sYWIwYW5lY2UifQ.l6OVeSa3poXp6S4s8km8kA'
map_style = "mapbox://styles/bipartitehypercube/cjs26r8sw1cy31fqjhopqigtj"
scl = [0,"rgb(0,0,0)"],[0.25,"rgb(0, 0, 128)"],[0.5,"rgb(0, 25, 255)"],\
[0.75,"rgb(0, 0, 205)"],[1,"rgb(250, 250, 250)"]
data = [go.Scattermapbox(
lat=trans_train['latitude'],
lon=trans_train['longitude'],
mode='markers',
marker = dict(color = (trans_train['taxvaluedollarcnt']/trans_train["calculatedfinishedsquarefeet"]),colorscale = scl,reversescale = True,opacity = 0.7,size = 5,
colorbar = dict(thickness = 10,titleside = "right",
outlinecolor = "rgba(68, 68, 68, 0)",
ticks = "outside",ticklen = 3,
showticksuffix = "last",
ticksuffix = " Dollars Per Square Feet",dtick = 700)))]
layout = go.Layout(
autosize=True,
hovermode='closest',
mapbox=dict(
accesstoken=mapbox_access_token,
bearing=0,
center=dict(lat=34,lon=-118.5),
style= map_style,
pitch=0,
zoom=7,),
title = 'Transaction location',
)
fig = dict(data=data, layout=layout)
plotly.offline.iplot(fig, filename='Geolocation of properties')
freq = df["yearbuilt"].value_counts()
freq = freq.values
freq = freq.tolist()
years = df["yearbuilt"].unique()
years = years.tolist()
years1 = transaction["yearbuilt"].unique()
years1 = years1.tolist()
freq1 = transaction["yearbuilt"].value_counts()
freq1 = freq1.values
freq1 = freq1.tolist()
ly0 = []
for i in years:
ly0.append(int(i))
ly1 = []
for i in years1:
ly1.append(int(i))
overall = pd.DataFrame({'yearbuilt': ly0,'Marketprop': freq})
transact = pd.DataFrame({'yearbuilt': ly1,'Transactionofprop': freq1})
comparison = pd.merge(overall,transact, on = 'yearbuilt', how = 'outer')
comparison = comparison.fillna(0)
#comparison['yearbuilt'] = comparison['yearbuilt'].apply(str)
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
column_names_to_normalize = ['Marketprop','Transactionofprop']
x = comparison[column_names_to_normalize].values
x_scaled = min_max_scaler.fit_transform(x)
df_temp = pd.DataFrame(x_scaled, columns=column_names_to_normalize, index = comparison.index)
comparison[column_names_to_normalize] = df_temp
#comparison[comparison["Transactionofprop"] > 0.8]
#comparison[comparison["Marketprop"] > 0.8]
The left plot (blue line) indicates that the overal supply of the market based on the age of the house is from 1900 to 2000s.
The left plot (orange line) indicates that transactions do overlay the supply of properties in the market. However, the two line plots spikes at different areas.
The orangle line in the left plot show that consumers like houses built around 1950s to 1960s and from 1980s to 1990s. This may indicate buying houses from 1960s for remodeling and 1980s for living.
The right plot shows the prediction error of housing prices by Zillow. The Zillow estimates are relatively accurate. By locating the high fluctions in prediction error we can understand that the house prices are highly volatile from the 1880s to 1940s.
import matplotlib.pyplot as plt
from matplotlib import pylab
import datetime
import numpy as np
import seaborn as sns; sns.set()
palette = dict(zip([1,2],
sns.color_palette("nipy_spectral_r", 4)))
fig, (ax1, ax2)= plt.subplots(ncols = 2,figsize=(16, 5))
ax1 = sns.lineplot(comparison['yearbuilt'],comparison["Marketprop"], data = comparison, palette = palette,ax=ax1, label = "Property Supply")
leg =ax1.legend();
ax1 = sns.lineplot(comparison['yearbuilt'],comparison["Transactionofprop"], data = comparison, palette = palette,ax=ax1, label = "Property Demand").set_title('Properties by Year Built')
ax2 = sns.lineplot(transaction['yearbuilt'],abs(transaction["logerror"]), data = transaction, ax = ax2, label = "Log Error").set_title("Logerror Fluctuations by Year Built")